import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Index page that links to one entry page per initial letter.
domain_url = 'https://kaoyan.koolearn.com/20210414/1439075.html'
print("正在从目标url爬取数据:", domain_url)
# Timeout so a stalled server cannot hang the script forever.
domain_requests = requests.get(domain_url, timeout=30)
domain_requests.encoding = 'utf-8'
domain_html_txt = domain_requests.text
domain_html = BeautifulSoup(domain_html_txt, 'html.parser')

# Collected word lines ("<number> <english> <chinese>") across all entry pages.
output = []
# Entry links are titled "首字母X的单词" (words whose initial letter is X).
isEntry = re.compile("首字母.的单词")
for entry_soup in domain_html.find('div', class_="xqy_core_text").table.tbody.find_all('a'):
    # <a> tags containing nested markup have .string == None; guard before
    # re.match, which raises TypeError on None.
    if entry_soup.string and isEntry.match(entry_soup.string):
        # Resolve possibly-relative hrefs against the index page URL.
        entry_url = urljoin(domain_url, entry_soup['href'])
        print("正在爬", entry_soup.string, ":", entry_url)
        entry_requests = requests.get(entry_url, timeout=30)
        entry_requests.encoding = 'utf-8'
        entry_html = BeautifulSoup(entry_requests.text, 'html.parser')
        # First <p> is a page header; skip it. Trailing "以上就是…" paragraphs
        # are boilerplate, not vocabulary.
        for line in entry_html.find('div', class_="xqy_core_text").find_all('p')[1:]:
            # Paragraphs with nested tags have .string == None; skip them.
            if line.string is not None:
                text = line.string.strip()
                # Drop blank lines too — they would become empty CSV rows.
                if text and not text.startswith("以上就是"):
                    output.append(text)

output_filename = "KaoYanWords.csv"
# Write UTF-8 explicitly: the platform default encoding (e.g. GBK or cp1252
# on Windows) may not represent the Chinese definitions and would crash.
# Plain "w" — the original "w+" read access was never used.
with open(output_filename, "w", encoding="utf-8") as dest:
    for line in output:
        # First split separates the leading number from the English word,
        # second separates the English word from the Chinese definition.
        list_of_str = line.split(sep=" ", maxsplit=2)
        # Drop the leading number.
        list_of_str = list_of_str[1:]
        if not list_of_str:
            # Line had no space at all (split gave one element, now dropped) —
            # skip it instead of raising IndexError on list_of_str[0] below.
            continue
        # Commas inside the Chinese part would break the CSV columns; turn
        # them into semicolons.
        if len(list_of_str) >= 2:
            list_of_str[1] = list_of_str[1].replace("，", ";")  # full-width comma
            list_of_str[1] = list_of_str[1].replace(",", ";")   # ASCII comma
        # Normalize the English word to lowercase.
        list_of_str[0] = list_of_str[0].lower()
        # Comma-joined fields — that is the CSV row.
        dest.write(",".join(list_of_str))
        dest.write("\n")

print("输出保存在:", output_filename)
